# Needed packages:
library(data.table)
# Read from the snapshot, keeping only the columns used further down.
matches <- fread(
  file.path(getwd(), "data", "matches.csv"),
  select = c(
    "match_id", "match_hometeam_name", "match_awayteam_name",
    "league_id", "match_status",
    "match_hometeam_score", "match_awayteam_score"
  )
)

# Keep only finished matches of league 148 (English Premier League).
finished_epl_matches <- matches[match_status == "Finished" & league_id == 148]

# Encode the result from the home side's point of view:
#   1 = home win, 0 = draw, -1 = away win.
finished_epl_matches[
  ,
  outcome := ifelse(
    match_hometeam_score > match_awayteam_score,
    1,
    ifelse(match_hometeam_score < match_awayteam_score, -1, 0)
  )
]
finished_epl_matches

# Read the bets snapshot.
bets <- fread(file.path(getwd(), "data", "bets.csv"))

# Keep only the three match-result odds: home win, draw and away win.
bets <- bets[variable %in% c("odd_1", "odd_x", "odd_2")]

# Attach the odds to the finished matches (inner join on match_id).
merged_data <- merge(finished_epl_matches, bets, by = "match_id")
merged_data
# Ranked probability score for a single forecast.
#
# probs:    predicted probabilities, one per ordered outcome category.
# outcomes: observed indicator vector over the same categories.
# Returns the sum of squared differences between the two cumulative
# distributions, divided by (number of categories - 1).
RPS_single <- function(probs, outcomes) {
  cumulative_gap <- cumsum(probs) - cumsum(outcomes)
  sum(cumulative_gap^2) / (length(probs) - 1)
}
# Ranked probability score for many forecasts at once.
#
# probs:    matrix-like object, one row per forecast and one column per
#           ordered outcome category.
# outcomes: matrix-like object of the same dimensions holding the observed
#           indicator for each category.
# Returns one score per row: the row sum of squared differences between the
# cumulative distributions, divided by (number of columns - 1).
RPS_matrix <- function(probs, outcomes) {
  # apply() returns the per-row cumulative sums as columns, hence the t().
  cumulate_rows <- function(m) t(apply(as.matrix(m), 1, cumsum))
  cumulative_gap <- cumulate_rows(probs) - cumulate_rows(outcomes)
  rowSums(cumulative_gap^2) / (ncol(cumulative_gap) - 1)
}
# Turn bookmaker odds into normalised outcome probabilities.
# Use this function to analyze different bookmakers.
#
# bookmaker_name: TRUE to use every bookmaker in `data`, otherwise one or
#                 more values of `odd_bookmakers` to keep.
# data:           long-format data.table with at least the columns
#                 match_id, odd_bookmakers, variable ("odd_1", "odd_x",
#                 "odd_2") and value (the quoted odd). It is not modified.
#
# Returns a data.table with one row per (match_id, odd_bookmakers) and the
# columns p_win, p_draw, p_lose (raw inverse odds) plus p_win_imp,
# p_draw_imp, p_lose_imp (the same values rescaled to sum to 1). A pair that
# lacks one of the three odds gets NA in the affected columns.
bookmaker_draw_analysis <- function(bookmaker_name, data) {
  # TRUE is the "all bookmakers" sentinel. isTRUE() avoids comparing a
  # bookmaker name against a logical value.
  if (isTRUE(bookmaker_name)) {
    bookmaker_bets <- data
  } else {
    bookmaker_bets <- data[odd_bookmakers %in% bookmaker_name]
  }

  # Inverse of the first non-missing odd; NA when no such odd exists.
  first_inverse_odd <- function(odds) 1 / na.omit(odds)[1L]

  # Collapse to one row per match and bookmaker. This aggregates into a new
  # table instead of adding columns with `:=`, so the caller's `data` is
  # left untouched even when no bookmaker filter is applied.
  bookmaker_bets <- bookmaker_bets[
    !is.na(match_id) & !is.na(odd_bookmakers),
    .(
      p_win = first_inverse_odd(value[variable == "odd_1"]),
      p_draw = first_inverse_odd(value[variable == "odd_x"]),
      p_lose = first_inverse_odd(value[variable == "odd_2"])
    ),
    by = .(match_id, odd_bookmakers)
  ]
  # Deterministic row order: by bookmaker, then by match.
  setorder(bookmaker_bets, odd_bookmakers, match_id)

  # TASK 2.2
  # Remove the bookmaker margin: rescale the inverse odds so that the three
  # probabilities of each row sum to 1.
  bookmaker_bets[
    ,
    c("p_win_imp", "p_draw_imp", "p_lose_imp") := {
      inverse_odds_total <- p_win + p_draw + p_lose
      list(
        p_win / inverse_odds_total,
        p_draw / inverse_odds_total,
        p_lose / inverse_odds_total
      )
    }
  ]
  bookmaker_bets[]
}
# Ranked probability score of one forecast against one observed outcome.
#
# probs:    predicted probabilities over the ordered outcome categories.
# outcomes: observed indicator vector over the same categories.
# Returns the summed squared distance between the cumulative forecast and the
# cumulative observation, scaled by (number of categories - 1).
rps <- function(probs, outcomes) {
  n_categories <- length(probs)
  cumulative_forecast <- cumsum(probs)
  cumulative_observed <- cumsum(outcomes)
  squared_distance <- (cumulative_forecast - cumulative_observed)^2
  sum(squared_distance) / (n_categories - 1)
}
# Implied probabilities for every (match, bookmaker) pair of finished matches.
past_match_predictions <- bookmaker_draw_analysis(TRUE, merged_data)
past_match_predictions <- past_match_predictions[, -c("p_win", "p_draw", "p_lose")]

# Attach the realised outcome of each match. The join column is named
# explicitly so the update does not depend on match_id being the first column
# of finished_epl_matches.
setDT(past_match_predictions, key = "match_id")
past_match_predictions[
  finished_epl_matches,
  outcome := i.outcome,
  on = "match_id"
]
past_match_predictions

# One-hot encode the outcome in the fixed order -1 (away win), 0 (draw),
# 1 (home win). Building all three columns explicitly keeps them aligned with
# the probability columns below even if one outcome never occurs in the data.
outcome_for_rps <- as.data.table(
  outer(past_match_predictions$outcome, c(-1, 0, 1), "==") * 1
)
setnames(outcome_for_rps, c("-1", "0", "1"))

# Probability columns are passed in the same order as the outcome columns:
# lose, draw, win.
past_match_predictions[
  ,
  rps := RPS_matrix(.SD, outcome_for_rps),
  .SDcols = c("p_lose_imp", "p_draw_imp", "p_win_imp")
]

# Independent copy, so later changes to one table do not affect the other.
overall_results <- copy(past_match_predictions)
overall_results

# Mean RPS per bookmaker, best (lowest) first.
bookmaker_rps <- overall_results[
  !is.na(odd_bookmakers),
  .(rps_mean = mean(rps)),
  keyby = odd_bookmakers
][order(rps_mean)]
bookmaker_rps

# Average of the per-bookmaker mean scores.
mean(bookmaker_rps$rps_mean)
## [1] 0.2262412
# League 148 rows with a missing match_status (treated here as not yet
# played), without the status, score and league columns.
unplayed_epl_matches <- matches[is.na(match_status) & league_id == 148][
  ,
  !c("match_status", "match_hometeam_score", "match_awayteam_score", "league_id")
]

# Attach the odds and derive implied probabilities per match and bookmaker.
unplayed_merged_data <- merge(unplayed_epl_matches, bets, by = "match_id")
unplayed_predictions <- bookmaker_draw_analysis(TRUE, unplayed_merged_data)
unplayed_predictions <- unplayed_predictions[, !c("p_win", "p_draw", "p_lose")]
unplayed_predictions <- merge(
  unplayed_epl_matches,
  unplayed_predictions,
  by = "match_id"
)

# Annotate every prediction with its bookmaker's mean RPS on past matches.
setkeyv(unplayed_predictions, "odd_bookmakers")
unplayed_predictions[
  bookmaker_rps,
  bookmaker_rps_ave := i.rps_mean,
  on = "odd_bookmakers"
]
unplayed_predictions

# Average the implied probabilities over all bookmakers for each match and
# put the match details back next to the averages.
average_cols <- c("p_win_imp", "p_draw_imp", "p_lose_imp")
average_predictions <- unplayed_predictions[
  ,
  lapply(.SD, mean),
  by = match_id,
  .SDcols = average_cols
]
average_predictions <- merge(
  unplayed_epl_matches,
  average_predictions,
  by = "match_id"
)
average_predictions